CSE314 Group Project
  • Project Overview
  • Data Overview
  • Data Preprocess
  • Data Visualization
  • Demo Feature Selection
  • Database
  • Model - Logistics
  • Model - Random Forest
    • Result
    • Result
  • Model Examination
  • Kaggle Competition
CSE314 Group Project
  • »
  • Model - Random Forest

Model - Random Forest¶

In [2]:
Copied!
from pandas_profiling import ProfileReport
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport import pandas as pd import numpy as np
In [3]:
Copied!
df = pd.read_csv('train.csv')
df = pd.read_csv('train.csv')

Result¶

In [11]:
Copied!
df.head()
df.head()
Out[11]:
raw_row_number location county_name subject_age subject_race subject_sex officer_id_hash department_name type arrest_made ... outcome frisk_performed search_conducted search_person search_vehicle reason_for_stop raw_Ethnicity raw_Race raw_action_description date_time
0 12511107 NaN forsyth county 18.0 white male f2f6b08c97 Winston-Salem Police Department vehicular False ... citation False False False False Speed Limit Violation N W Citation Issued 2010-12-05 01:51:24
1 5439683 raleigh wake county 25.0 hispanic male 1e3fa73f20 Raleigh Police Department vehicular False ... warning False True True True Vehicle Regulatory Violation H W Verbal Warning 2005-09-25 03:40:00
2 18674698 charlotte area mecklenburg county 30.0 black female 59a754eb04 Charlotte-Mecklenburg Police Department vehicular False ... warning False False False False Speed Limit Violation N B Verbal Warning 2014-11-15 02:00:00
3 12600300 charlotte area mecklenburg county 21.0 white male 0dc507ea69 Charlotte-Mecklenburg Police Department vehicular False ... warning False False False False Vehicle Regulatory Violation N W Verbal Warning 2011-01-23 00:16:00
4 6035053 NaN durham county 38.0 black female 91822b2dfe Durham Police Department vehicular False ... citation False False False False Speed Limit Violation N B Citation Issued 2006-06-18 10:17:17

5 rows × 22 columns

In [7]:
Copied!
profile = ProfileReport(df, title="Police Stop Profiling Report")
profile.to_file
profile = ProfileReport(df, title="Police Stop Profiling Report") profile.to_file
Summarize dataset: 100%|██████████| 41/41 [06:19<00:00,  9.25s/it, Completed]                               
Generate report structure: 100%|██████████| 1/1 [00:07<00:00,  7.29s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.73s/it]
Out[7]:

In [4]:
Copied!
#Getting rid of unneccessary columns 
df["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2], inplace=True)
df.drop(columns=['date_time', 'department_name', 'county_name', 'type', 'raw_Race', 'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location', 'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued', 'warning_issued'], inplace=True)
#Getting rid of unneccessary columns df["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2], inplace=True) df.drop(columns=['date_time', 'department_name', 'county_name', 'type', 'raw_Race', 'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location', 'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued', 'warning_issued'], inplace=True)
In [5]:
Copied!
#Change reason for stop and boolean columns to numerical
df = pd.concat([df, df['reason_for_stop'].str.get_dummies()], axis=1)
df.drop(columns=['reason_for_stop'], inplace=True)
df['subject_sex'].replace({'male': 1, 'female': 0}, inplace=True)
df['frisk_performed'].replace({True: 1, False: 0}, inplace=True)
df['search_conducted'].replace({True: 1, False: 0}, inplace=True)
df['search_person'].replace({True: 1, False: 0}, inplace=True)
df['search_vehicle'].replace({True: 1, False: 0}, inplace=True)
#Change reason for stop and boolean columns to numerical df = pd.concat([df, df['reason_for_stop'].str.get_dummies()], axis=1) df.drop(columns=['reason_for_stop'], inplace=True) df['subject_sex'].replace({'male': 1, 'female': 0}, inplace=True) df['frisk_performed'].replace({True: 1, False: 0}, inplace=True) df['search_conducted'].replace({True: 1, False: 0}, inplace=True) df['search_person'].replace({True: 1, False: 0}, inplace=True) df['search_vehicle'].replace({True: 1, False: 0}, inplace=True)
In [76]:
Copied!
ProfileReport(df, title='Useful Profile Report')
ProfileReport(df, title='Useful Profile Report')
Summarize dataset: 100%|██████████| 32/32 [09:45<00:00, 18.29s/it, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.28s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
Out[76]:

In [6]:
Copied!
#read test files and also perform the same changes
test = pd.read_csv('test.csv')
test["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2], inplace=True)
test.drop(columns=['date_time', 'department_name', 'county_name', 'type', 'raw_Race', 'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location', 'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued', 'warning_issued'], inplace=True)
test = pd.concat([test, test['reason_for_stop'].str.get_dummies()], axis=1)
test.drop(columns=['reason_for_stop'], inplace=True)
test['subject_sex'].replace({'male': 1, 'female': 0}, inplace=True)
test['frisk_performed'].replace({True: 1, False: 0}, inplace=True)
test['search_conducted'].replace({True: 1, False: 0}, inplace=True)
test['search_person'].replace({True: 1, False: 0}, inplace=True)
test['search_vehicle'].replace({True: 1, False: 0}, inplace=True)
#read test files and also perform the same changes test = pd.read_csv('test.csv') test["outcome"].replace(['warning', 'citation', 'arrest'], [0, 1, 2], inplace=True) test.drop(columns=['date_time', 'department_name', 'county_name', 'type', 'raw_Race', 'raw_Ethnicity', 'raw_action_description', 'subject_race', 'location', 'raw_row_number', 'officer_id_hash', 'arrest_made', 'citation_issued', 'warning_issued'], inplace=True) test = pd.concat([test, test['reason_for_stop'].str.get_dummies()], axis=1) test.drop(columns=['reason_for_stop'], inplace=True) test['subject_sex'].replace({'male': 1, 'female': 0}, inplace=True) test['frisk_performed'].replace({True: 1, False: 0}, inplace=True) test['search_conducted'].replace({True: 1, False: 0}, inplace=True) test['search_person'].replace({True: 1, False: 0}, inplace=True) test['search_vehicle'].replace({True: 1, False: 0}, inplace=True)
In [7]:
Copied!
#Seperate data for modeling
X_train = df.drop('outcome', axis=1)
X_test = test.drop('outcome', axis=1)
Y_train = df['outcome']
Y_test = test['outcome']
#Seperate data for modeling X_train = df.drop('outcome', axis=1) X_test = test.drop('outcome', axis=1) Y_train = df['outcome'] Y_test = test['outcome']
In [10]:
Copied!
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score

#Test a model
model = LogisticRegression(solver = 'liblinear')
model.fit(X_train,Y_train)
Y_pred = model.predict(X_test)
from sklearn.linear_model import LogisticRegression from sklearn.metrics import r2_score #Test a model model = LogisticRegression(solver = 'liblinear') model.fit(X_train,Y_train) Y_pred = model.predict(X_test)

Result¶

In [ ]:
Copied!
#Accuracy
np.mean(Y_test==Y_pred)
#Accuracy np.mean(Y_test==Y_pred)
Out[ ]:
0.661227829379674
In [11]:
Copied!
#r^2 score
r2_score(Y_test, Y_pred)
#r^2 score r2_score(Y_test, Y_pred)
Out[11]:
-0.27816842786267326

Not Very Accurate!

In [19]:
Copied!
#Try random forest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators = 100, random_state = 0)

rf.fit(X_train, Y_train)
Y_pred = rf.predict(X_test)
#Try random forest from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(n_estimators = 100, random_state = 0) rf.fit(X_train, Y_train) Y_pred = rf.predict(X_test)
In [22]:
Copied!
#Accuracy
np.mean(Y_test==Y_pred)
#Accuracy np.mean(Y_test==Y_pred)
Out[22]:
0.00021062523290290175
In [23]:
Copied!
#r^2 score
r2_score(Y_test, Y_pred)
#r^2 score r2_score(Y_test, Y_pred)
Out[23]:
0.1516346829529137
Previous Next

Built with MkDocs using a theme provided by Read the Docs.
« Previous Next »